# laoding 
library(tidyverse)
library(dplyr)
library(plyr)
library(readxl)
library(ggplot2)
library(gridExtra)
library(reshape2)
library(scales)
library(devtools)
library(plotly)
library(data.table)

Chicago Public Schools Data Explore

The following graphs present CPS, including enrollment, demographic, performance metrics, etc.

Overview of the CPS enrollment

Chicago Public Schools (CPS), currently contains 479 elementary schools, and 165 high schools in the school district. Given the number of students enrolled in school year 2018-2019, 76.6% of the population are economically disadvantages students, 18.7% are English language learners, and 14.1% are students with individualized education programs (basically students with disabilities).


First

# grpah 1: enrollment
# function - generate new variables 
gen_var <- function(df, year){
  df$year <- year 
  df$kindergarten <- df["PE"] + df["PK"] + df["K"]
  df$elementary <- df["01"] + df["02"] + df["03"] + df["04"] + 
    df["05"] + df["06"] + df["07"] + df["08"]
  df$high <- df["09"] + df["10"] + df["11"] + df["12"]  
  var_list <- c('year', 'kindergarten', 'elementary', 'high')
  df <- df[var_list]
  df <- sapply( df, as.numeric )
  return(df)
}


# read in files
enroll_2019 <- 
  read_excel("enrollment/Demographics_20thDay_2019.xls", sheet = "Schools") 
enroll_2019 <- 
  enroll_2019[enroll_2019$"School Name" == "District Total 2018-2019",]
enroll_2019 <- gen_var(enroll_2019, 2019)


enroll_2018 <-
  read_excel("enrollment/Demographics_20thDay_2018.xls", sheet = "Schools")
enroll_2018 <-
  enroll_2018[enroll_2018$"School Name" == "District Total 2017-2018",]
enroll_2018 <- gen_var(enroll_2018, 2018)


enroll_2017 <- 
  read_excel("enrollment/Demographics_20thDay_2017.xls", sheet = "Schools")
enroll_2017 <-
  enroll_2017[enroll_2017$"School Name" == "District Total 2016-2017",]
enroll_2017 <- gen_var(enroll_2017, 2017)


enroll_2016 <- 
  read_excel("enrollment/Demographics_20thDay_2016.xls", sheet = "Sheet1")
enroll_2016 <-
  enroll_2016[enroll_2016$"Network" == "District Totals",]
enroll_2016 <-
  enroll_2016[rowSums( is.na(enroll_2016) ) <= 10, ]
enroll_2016 <- gen_var(enroll_2016, 2016)


enroll_2015 <- 
  read_excel("enrollment/Demographics_20thDay_2015.xls", sheet = "Sheet1")
enroll_2015 <-
  enroll_2015[enroll_2015$"Network" == "District Totals",]
enroll_2015 <-
  enroll_2015[rowSums( is.na(enroll_2015) ) <= 10, ]
enroll_2015 <- gen_var(enroll_2015, 2015)


enroll_2014 <- 
  read_excel("enrollment/Demographics_20thDay_2014.xls", sheet = "enrollment_20th_day_2014")
enroll_2014 <-
  enroll_2014[enroll_2014$"Network" == "District Totals",]
enroll_2014 <-
  enroll_2014[rowSums( is.na(enroll_2014) ) <= 10, ]
enroll_2014 <- gen_var(enroll_2014, 2014)


enroll_2013 <- 
  read_excel("enrollment/Demographics_20thDay_2013.xls", sheet = "enrollment_20th_day_2013")
enroll_2013 <-
  enroll_2013[enroll_2013$"Network" == "District Total",]
enroll_2013 <-
  enroll_2013[rowSums( is.na(enroll_2013) ) <= 10, ]
enroll_2013 <- gen_var(enroll_2013, 2013)


enroll_2012 <- 
  read_excel("enrollment/Demographics_20thDay_2012.xls", sheet = "enrollment_20th_day_2012")
enroll_2012 <-
  enroll_2012[enroll_2012$"Network" == "District Totals",]
enroll_2012 <-
  enroll_2012[rowSums( is.na(enroll_2012) ) <= 10, ]
enroll_2012 <- gen_var(enroll_2012, 2012)


enroll_2011 <- 
  read_excel("enrollment/Demographics_20thDay_2011.xls", sheet = "enrollment_20th_day")
enroll_2011 <-
  enroll_2011[enroll_2011$"Area" == "District Totals",]
enroll_2011 <-
  enroll_2011[rowSums( is.na(enroll_2011) ) <= 10, ]
enroll_2011 <- gen_var(enroll_2011, 2011)


enroll_2010 <- 
  read_excel("enrollment/Demographics_20thDay_2010.xls", sheet = "Sheet1")
enroll_2010 <-
  enroll_2010[enroll_2010$"Area" == "District Totals",]
enroll_2010 <-
  enroll_2010[rowSums( is.na(enroll_2010) ) <= 10, ]
enroll_2010 <- gen_var(enroll_2010, 2010)


enroll_2009 <- 
  read_excel("enrollment/Demographics_20thDay_2009.xls", sheet = "Query1")
enroll_2009 <- 
  enroll_2009[enroll_2009$"Area" == "Dsitrict Totals",]
enroll_2009 <-
  enroll_2009[rowSums( is.na(enroll_2009) ) <= 10, ]
enroll_2009 <- gen_var(enroll_2009, 2009)


enroll_2008 <- 
  read_excel("enrollment/Demographics_20thDay_2008.xls", sheet = "Sheet1")
enroll_2008 <- 
  enroll_2008[enroll_2008$"Area" == "District Totals",]
enroll_2008 <-
  enroll_2008[rowSums( is.na(enroll_2008) ) <= 10, ]
enroll_2008$K <- enroll_2008["Full-Day\nK"] + enroll_2008["Half-Day\nK"]
enroll_2008$"02" <- enroll_2008["02'"]
enroll_2008 <- gen_var(enroll_2008, 2008)

enroll_2007 <- 
  read_excel("enrollment/Demographics_20thDay_2007.xls", sheet = "Sheet1")
enroll_2007 <- 
  enroll_2007[enroll_2007$"Area" == "District Totals",]
enroll_2007 <-
  enroll_2007[rowSums( is.na(enroll_2007) ) <= 10, ]
enroll_2007$PE <- enroll_2007["Head\nStart"]
enroll_2007$PK <- enroll_2007["Other\nPK"] + enroll_2007["State\nPK"] + enroll_2007["PK\nSPED"]
enroll_2007$K <- enroll_2007["Full-Day\nK"] + enroll_2007["Half-Day\nK"]
enroll_2007 <- gen_var(enroll_2007, 2007)


enroll_2006 <- 
  read_excel("enrollment/Demographics_20thDay_2006.xls", sheet = "enrollment_0608")
enroll_2006 <- 
  enroll_2006[enroll_2006$"Area" == "District Totals",]
enroll_2006 <-
  enroll_2006[rowSums( is.na(enroll_2006) ) <= 10, ]
enroll_2006$PE <- enroll_2006["Head\nStart"]
enroll_2006$PK <- enroll_2006["Other\nPK"] + enroll_2006["State\nPK"] + enroll_2006["PK\nSPED"]
enroll_2006$K <- enroll_2006["Full-Day\nK"] + enroll_2006["Half-Day\nK"]
enroll_2006 <- gen_var(enroll_2006, 2006)


enroll_all = bind_rows(enroll_2019, enroll_2018, enroll_2017, enroll_2016, enroll_2015,
                    enroll_2014, enroll_2013, enroll_2012, enroll_2011, enroll_2010,
                    enroll_2009, enroll_2008, enroll_2007, enroll_2006) 
enroll_all$'total population' <- enroll_all$kindergarten + enroll_all$elementary + enroll_all$high


enroll_all <- enroll_all[c("year", "total population", "kindergarten", "elementary", "high")]
colnames(enroll_all) <- c("year", "Total Population", "Kindergarten", "Elementary School", "High School")


enroll_all <- melt(enroll_all, id.var="year")
colnames(enroll_all) <- c("Year", "Student_Type", "Headcount")
# draw graph
enrollment <- 
  ggplot(enroll_all, aes(x= Year, y = Headcount)) +
  geom_point(aes(color=Student_Type)) +
  geom_line(aes(color=Student_Type)) +
  geom_text(data=subset(enroll_all,Year == 2006), aes(label = Headcount), size = 3, vjust = 2, hjust = 0.3) +  
  geom_text(data=subset(enroll_all,Year == 2019), aes(label = Headcount), size = 3, vjust = -0.9, hjust = 0.5) +  
  geom_text(data=subset(enroll_all,Year == 2010), aes(label = Headcount), size = 3, vjust = 2, hjust = 0.3) +  
  geom_text(data=subset(enroll_all,Year == 2015), aes(label = Headcount), size = 3, vjust = -0.9, hjust = 0.5) +  
  facet_wrap( ~ Student_Type, scales = "free_y",  nrow = 4, 
              labeller = as_labeller(c("High School" = "High School Enrollment Dropped by 3265", 
                                       "Elementary School" = "Elementary School Enrollment Dropped by 47492", 
                                       "Total Population" ="Total Enrollment Dropped by 59611" ,
                                       "Kindergarten" = "Kindergarten Enrollment Dropped by 8854"))) +
  scale_x_continuous(breaks=seq(2006, 2019, 1)) +
  scale_color_manual(values = c("High School" = "#F59AA3", "Elementary School" = "#ffa45c", 
                                "Total Population" ="#34a7b2" ,"Kindergarten" = "#5b2e35")) +
  xlab("Year") + ylab("Enrollment Headcount") + 
  theme_minimal() +
  labs(
    title = "Chicago Public Schools Enrollment Drops by 60,000 Students in the Past 14 Years ", 
    subtitle = "Enrollment drops for all types of students, from kindergarten to high school",
    caption = "CPS School Data Report: 2006-2019 20th Day Membership") +
  theme(
    plot.title = element_text(size = 18, hjust = 0.5, face = "bold", family = "Concert One"),
    plot.subtitle = element_text(size = 14, hjust = 0.5, family = "Bitter"),
    plot.caption = element_text(size = 12, hjust = 1, family = "Lobster"), 
    axis.title.x = element_text(size=12, face="bold"), 
    axis.title.y = element_text(size=12, face="bold"),
    strip.text.x = element_text(size = 12, face="bold", color = "#3c4f65"),
    panel.background = element_blank(),
    panel.grid.major.y = element_line(size = 0.2, linetype = 'solid',
                                    colour = "lightgray"), 
    axis.text.y = element_blank(),
    legend.position = "bottom",
    legend.spacing.x = unit(0.5, 'cm'),
    legend.text = element_text(size=10, face="bold"),
    legend.title = element_blank())

enrollment 

This graphs shows that, over the past 10 year, CPS has experienced drop in enrollment. Enrollments drop throughout all its schools, from kindergartens, elementary schools, to high schools. Noteworthy, the 2019 (20th day enrollment) total population has dropped almost 50,000 compared to 2010, from 409279 to 361314. Within each student type, the biggest drop comes from elementary school students which also serve as the majotiry group of the population.

# graph 2

# prepare data
column_name_1 <- c('type', 'total',
                   'white', 'w_per', 'african american', 'a_per', 'pacific', 'p_per', 
                   'native american', 'n_per', 'hispanic', 'h_per', 'multi', 'm_per', 
                   'asian', 'as_per', 'hawaiian', 'ha_per', 'na', 'na_per')

column_name_2 <- c('type', 'total',
                   'white', 'w_per', 'african american', 'a_per', 'native american', 'n_per',
                   'pacific', 'p_per', 'hispanic', 'h_per')

column_name_3 <- c('type', 'total',
                   'white', 'w_per', 'african american', 'a_per', 'native american', 'n_per',
                   'pacific', 'p_per', 'hispanic', 'h_per', 'multi', 'm_per')


# function - generate new variables 
gen_var <- function(df, year, column){
  df <- df[rowSums(is.na(df)) < 10, ]
  colnames(df) <- column 
  df$type <- NULL
  df$Year <- year 
  df$African_American <- as.numeric(df["african american"]) / as.numeric(df["total"]) * 100
  df$Hispanic <- as.numeric(df["hispanic"]) / as.numeric(df["total"]) * 100
  df$White <- as.numeric(df["white"]) / as.numeric(df["total"]) * 100
  if (("asian" %in% names(df)) && ("multi" %in% names(df)))
  {
    df$Asian <- as.numeric(df["asian"]) / as.numeric(df["total"]) * 100;
    df$Other <- (as.numeric(df["pacific"]) + as.numeric(df["native american"]) + as.numeric(df["multi"]) + 
                   as.numeric(df["hawaiian"]) + as.numeric(df["na"])) / as.numeric(df["total"]) * 100;
  }
  else if ((!"asian" %in% names(df)) && (!"multi" %in% names(df)))
  {
    df$Asian <- 0;
    df$Other <- (as.numeric(df["pacific"]) + as.numeric(df["native american"])) / as.numeric(df["total"]) * 100;
  }
  else if  ((!"asian" %in% names(df)) && ("multi" %in% names(df)))
  {
    df$Asian <- 0;
    df$Other <- (as.numeric(df["pacific"]) + as.numeric(df["native american"]) + as.numeric(df["multi"])) / as.numeric(df["total"]) * 100
  }
  var_list <- c('African_American', 'Hispanic', 'White', 'Asian', 'Other', 'Year')
  df <- df[var_list]
  return(df)
}


# read in files
race_2019 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2019.xls", sheet = "Grades", skip = 1) 
race_2019 <- 
  race_2019[race_2019$"Grade Level" == "District Total",]
race_2019 <- gen_var(race_2019, 2019, column_name_1)



race_2018 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2018.xls", sheet = "Grades", skip = 1) 
race_2018 <- 
  race_2018[race_2018$"Grade Level" == "District Total",]
race_2018 <- gen_var(race_2018, 2018, column_name_1)


race_2017 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2017.xls", sheet = "Grades", skip =1) 
race_2017 <- 
  race_2017[race_2017$"Grade Level" == "District Total",]
race_2017 <- gen_var(race_2017, 2017, column_name_1)


race_2016 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2016.xls", sheet = "Grades", skip =1) 
race_2016 <- 
  race_2016[race_2016$"Grade Level" == "District Totals",]
race_2016 <- gen_var(race_2016, 2016, column_name_1)


race_2015 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2015.xls", sheet = "Grades", skip =1) 
race_2015 <- 
  race_2015[race_2015$"Grade Level" == "District Totals",]
race_2015 <- gen_var(race_2015, 2015, column_name_1)


race_2014 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2014.xls", sheet = "Grades", skip =1) 
race_2014 <- 
  race_2014[race_2014$"Grade Level" == "District Totals",]
race_2014 <- gen_var(race_2014, 2014, column_name_1)

race_2013 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2013.xls", sheet = "Grades", skip =1) 
race_2013 <- 
  race_2013[race_2013$"Grade Level" == "District Totals",]
race_2013 <- gen_var(race_2013, 2013, column_name_1)

race_2012 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2012.xls", sheet = "Grades", skip =1) 
race_2012 <- 
  race_2012[race_2012$"Grade Level" == "District Totals",]
race_2012 <- gen_var(race_2012, 2012, column_name_1)


race_2011 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2011.xls", sheet = "Grades", skip =1) 
race_2011 <- 
  race_2011[race_2011$"..1" == "District Totals",]
race_2011 <- gen_var(race_2011, 2011, column_name_1)


race_2010 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2010.xls", sheet = "Grades", skip =1) 
race_2010 <- 
  race_2010[race_2010$"..1" == "Dsitrict Totals",]
race_2010 <- gen_var(race_2010, 2010, column_name_2)



race_2009 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2009.xls", sheet = "Grades", skip =1) 
race_2009 <- 
  race_2009[race_2009$"..1" == "District Totals",]
race_2009 <- gen_var(race_2009, 2009, column_name_2)



race_2008 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2008.xls", sheet = "Grades", skip =1, range = cell_cols("A:N")) 
race_2008 <- 
  race_2008[race_2008$"..1" == "Grand Total",]
race_2008 <- gen_var(race_2008, 2008, column_name_3)


race_2007 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2007.xls", sheet = "Totals_by_Grades", skip =1, range = cell_cols("A:N")) 
race_2007 <- 
  race_2007[race_2007$"..1" == "Grand Total",]
race_2007 <- gen_var(race_2007, 2007, column_name_3)


race_2006 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2006.xls", sheet = "Totals by Grade", skip =1, range = cell_cols("A:N")) 
race_2006 <- 
  race_2006[race_2006$"..1" == "GRAND TOTAL",]
race_2006 <- gen_var(race_2006, 2006, column_name_3)


race_2005 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2005.xlsx", sheet = "School Types", skip =1, range = cell_cols("B:M")) 
race_2005 <- 
  race_2005[race_2005$"..1" == "Grand Total",]
race_2005 <- gen_var(race_2005, 2005, column_name_2)



race_2004 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2004.xls", sheet = "Totals by Types", skip =1, range = cell_cols("B:M")) 
race_2004 <- 
  race_2004[race_2004$"..1" == "Grand Total",]
race_2004 <- gen_var(race_2004, 2004, column_name_2)


race_2003 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2003.xls", sheet = "Totals by Type", skip =1, range = cell_cols("B:M")) 
race_2003 <- 
  race_2003[race_2003$"..1" == "Grand Total",]
race_2003 <- gen_var(race_2003, 2003, column_name_2)


race_2002 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2002.xls", sheet = "Totals by Types", skip =1, range = cell_cols("B:M")) 
race_2002 <- 
  race_2002[race_2002$"..1" == "Grand Total",]
race_2002 <- gen_var(race_2002, 2002, column_name_2)


race_2001 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2001.xls", sheet = "Totals by Type", skip =1, range = cell_cols("B:M")) 
race_2001 <- 
  race_2001[race_2001$"..1" == "Grand Total",]
race_2001 <- gen_var(race_2001, 2001, column_name_2)


race_2000 <- 
  read_excel("demo_racial/Demographics_RacialEthnic_2000.xls", sheet = "Totals by Type", skip =1, range = cell_cols("B:M")) 
race_2000 <- 
  race_2000[race_2000$"..1" == "Totals",]
race_2000 <- gen_var(race_2000, 2000, column_name_2)


race = bind_rows(race_2019, race_2018, race_2017, race_2016, race_2015,
                       race_2014, race_2013, race_2012, race_2011, race_2010, race_2009,
                       race_2008, race_2007, race_2006, race_2005, race_2004, race_2003,
                       race_2002, race_2001, race_2000) 

race$African_American <- -(race$African_American)
race <- race[c('African_American','White',  'Hispanic', 'Year')]

race <- melt(race, id.var="Year")
colnames(race) <- c("Year", "Ethnicity", "Percentage")
race$Percentage <- round(race$Percentage, digits = 2) 
race$Year <- as.numeric(race$Year)
#draw graph
race_bar <- ggplot(race, aes(x= Year, y = Percentage, group = Ethnicity, 
                             fill = factor(Ethnicity, levels = c('African_American','Hispanic',  'White')), 
                             label = sprintf("%0.2f", round(Percentage, digits = 2)))) +
  geom_bar(stat = "identity", width = 0.7, alpha = 0.95) + 
  geom_text(data=subset(race, Ethnicity == 'African_American'), aes(label = sprintf("%0.2f", round(abs(Percentage), digits = 2))), 
            size = 3.5, position = position_stack(vjust = 0.3)) +
  geom_text(data=subset(race, Ethnicity != 'African_American'), size = 3.5, position = position_stack(vjust = 0.7)) +

  coord_flip() +
  scale_x_discrete(limits = rev(race$Year), expand = c(0, 0)) +
  scale_fill_manual(values=c("#BBC7BA","#F9D5D3","#C1DAE0")) +
  scale_y_continuous(breaks = (seq(-60, 60, 10)), 
                     labels = abs(seq(-60, 60, 10)),
                     expand = c(0.01, 0)) +
  labs(
    title = "Growing Hispanic Population, Shrinking African American Population", 
    subtitle = "More than 80% Chicago Public Schools Students are African American and Hispanic Students", 
    caption = "CPS School Data Report: 2000-2019 Racial/Ethnic") +
  theme(
    plot.title = element_text(size = 18, hjust = 0.5, face = "bold", family = "Concert One"),
    plot.subtitle = element_text(size = 14, hjust = 0.5, family = "Bitter"),
    plot.caption = element_text(size = 12, hjust = 1, family = "Lobster"), 
    axis.title.x = element_text(size=14, face="bold", family =  "Crimson Text" ), 
    axis.title.y = element_text(size=14, face="bold", family =  "Crimson Text" ),
    strip.text.x = element_text(size = 10, face="bold"),
    panel.background = element_blank(),
    legend.position = "bottom",
    legend.spacing.x = unit(0.5, 'cm'),
    legend.text = element_text(size=13, face="bold", family = "Crimson Text" ),
    legend.title = element_blank())

race_bar

This graphs shows the make up of race/ethnicity – White, African American, and Hispanic percentage of CPS student bodys from 2000 - 2019. As a public school district in a big metropolitan area, CPS contains more than 80% of African American and Hispanic students. Over the last 20 years, the percentage of Hispanic students has been growing, and the percentage of African American Students has been shrinking. White students, however, counts for less than 10% of the total population for most of the years. In recent years, there is a slightly shift toward having more white students, and more students with other race/ethnicity which are majority Asian students.

# graph 3: race/ethnicity 

# prepare data 
# 2019 data
demo_2019 <- read_excel("demo_special/Demographics_LEPSPED_2019.xls", sheet = "Networks", range = cell_rows(4:25),
                        col_names = c("Network", "Population", "Bi_no", "Bi_per", "SpEd_no", 
                                      "SpEd_per", "FreeLunch_no", "FreeLunch_per"))
demo_2019$year <- rep(2019,nrow(demo_2019))

# 2018 data
demo_2018 <- read_excel("demo_special/Demographics_LEPSPED_2018.xls", sheet = "Networks", range = cell_rows(4:21),
                        col_names = c("Network", "Population", "Bi_no", "Bi_per", "SpEd_no", 
                                      "SpEd_per", "FreeLunch_no", "FreeLunch_per"))
demo_2018$year <- rep(2018,nrow(demo_2018))

# 2017 data
demo_2017 <- read_excel("demo_special/Demographics_LEPSPED_2017.xls", sheet = "Networks", range = cell_rows(4:22),
                        col_names = c("Network", "Population", "Bi_no", "Bi_per", "SpEd_no", 
                                      "SpEd_per", "FreeLunch_no", "FreeLunch_per"))
demo_2017$year <- rep(2017,nrow(demo_2017))

# combine dataset from all years
demo_all = bind_rows(demo_2019, demo_2018, demo_2017) 
demo_all = demo_all[demo_all$Bi_per >= 0.15,]

# rename cell 
demo_all$Network <- gsub("Service Leadership Academies", "SLA", demo_all$Network)

# convert values to numeric and percentage
demo_all$Bi_per <- as.numeric(as.character(demo_all$Bi_per)) * 100
demo_all$SpEd_per <- as.numeric(as.character(demo_all$SpEd_per)) * 100
demo_all$FreeLunch_per <- as.numeric(as.character(demo_all$FreeLunch_per)) * 100
# draw graph
lunch_bi <- ggplot(demo_all,  aes(x = FreeLunch_per, y = Bi_per)) +
  geom_point(alpha = 1, aes(color=Network), size = 3) +
  geom_smooth(method='lm',formula=y~x, se = FALSE) +
  geom_hline(data=subset(demo_all, year == 2019), 
             aes(yintercept = mean(Bi_per), group = year), linetype="dashed", color = "#f25f5c", size=.5) +
  geom_hline(data=subset(demo_all, year == 2018), 
             aes(yintercept = mean(Bi_per), group = year), linetype="dashed", color = "#f25f5c", size=.5) +
  geom_hline(data=subset(demo_all, year == 2017), 
             aes(yintercept = mean(Bi_per), group = year), linetype="dashed", color = "#f25f5c", size=.5) +
  geom_vline(data=subset(demo_all, year == 2019), 
             aes(xintercept = mean(FreeLunch_per), group = year), linetype="dashed", color = "#5ed7bf", size=.5) +
  geom_vline(data=subset(demo_all, year == 2018), 
             aes(xintercept = mean(FreeLunch_per), group = year), linetype="dashed", color = "#5ed7bf", size=.5) +
  geom_vline(data=subset(demo_all, year == 2017), 
             aes(xintercept = mean(FreeLunch_per), group = year), linetype="dashed", color = "#5ed7bf", size=.5) +
  facet_wrap( ~ year, nrow =1, labeller = as_labeller(c("2017" = "FY 1617", 
                                                        "2018" = "FY 1718", 
                                                        "2019" = "FY 1819"))) +
  scale_color_manual(values = c("Charter" = "#F59AA3", 
                                "Network 1" = "#ffa45c", 
                                "Network 2" =  "#34a7b2",
                                "Network 3" = "#5b2e35",
                                "Network 4" = "#a7d7c5",
                                "Network 6" = "#ffe0e0",
                                "Network 7" = "#caabd8",
                                "Network 8" = "#fffa67",
                                "Network 10" = "#a2eae2",
                                "ISP" = "#b5525c")) +
  coord_fixed(ratio = 1.8) +
  xlab("% Free/Reduced Lunch") + ylab("% Bilingual") +
  xlim(50, 95) + ylim(5, 50) +
  annotate("label",  x = 50, y = 48,  label = "Green: avg for Free Lunch \nRed: avg for Bilingual", size = 3, hjust = 0) +
  labs(
    title = "Networks with More Bilingual Population are also Networks \n with more Economically Disadvantaged Population",
    subtitle = "Distributions of 2017-2019, only for Networks' with more than 15% bilingual population", 
    caption = "CPS School Data Report: 2017-2019 Limited English Proficiency, Special Ed, Low Income, IEP") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, hjust = 0.5, face = "bold", family = "Concert One"),
    plot.subtitle = element_text(size = 14, hjust = 0.5, family = "Bitter"),
    plot.caption = element_text(size = 12, hjust = 1, family = "Lobster"), 
    axis.title.x = element_text(size=12, face="bold"), 
    axis.title.y = element_text(size=12, face="bold"),
    panel.grid.major.y = element_line(size = 0.2, linetype = 'solid',
                                    colour = "lightgray"), 
    panel.grid.major.x = element_line(size = 0.2, linetype = 'solid',
                                    colour = "lightgray"), 
    panel.background = element_blank(),
    legend.position = "bottom",
    legend.spacing.x = unit(0.5, 'cm'),
    legend.text = element_text(size=10, face="bold"),
    legend.title = element_blank(), 
    strip.text.x = element_text(size = 15, face="bold", color = "#3c4f65"))

lunch_bi

This graphs shows the relationship between % free lunch program enrollment and % bilingual popualtion among CPS Networks (breakdown mostly by location). Blue lines indicate the fitted condition, and red dotted lines indicate the mean level for % bilingual popluation among different years, and green dotted lines indicate the mean levle for % reduced/free lunch population among different years. Here, we would like to use % reduced/free lunch as an indicator for low income students. Therefore, the grpahs presents that Networks with more bilingual population are also Networks with more economically disadvantaged population. Especially for Network 7 and Network 8, contains schools in Midway and Pilsen Litter Village.

# graph 4
# read in files
SQRP <- read_excel("Accountability_SQRPratings_2018-2019_SchoolLevel.xls", sheet = "High Schools (grds 9-12 only)", 
                   skip = 1)
SQRP <- SQRP[ , which(names(SQRP) %in% c("School ID", "School Name", "SQRP Total Points Earned", 
                                         "4-Year Cohort Graduation Rate",  "Average Daily Attendance Rate", "College Enrollment Rate"))]
SQRP <- SQRP[complete.cases(SQRP), ]  
names(SQRP) <- c("ID", "Name", "SQRP_Score", "Graduation", "College_enroll",  "Attendance")

SQRP$Graduation <- as.numeric(as.character(SQRP$Graduation))
SQRP$Attendance <- as.numeric(as.character(SQRP$Attendance)) 
SQRP$College_enroll <- as.numeric(as.character(SQRP$College_enroll)) 

SQRP <- SQRP[SQRP$Graduation!=0 & SQRP$Attendance!=0 & SQRP$College_enroll!=0, ]
# draw graph
sqrp_grad_attend <- ggplot(SQRP, aes(x = Graduation, y = Attendance, size = College_enroll, fill = SQRP_Score)) +
  geom_point(shape = 21) + 
  xlab("% 4-Year Cohort Graduation Rate") + ylab("% Average Daily Attendance Rate") +
  labs(size = "% College Enrollment Rate", fill = "School Quality Rating") +
  scale_x_continuous(limits=c(20, 100), breaks=c(20, 30, 40, 50, 60, 70, 80, 90, 100)) +
  scale_y_continuous(limits=c(70, 100), breaks=c(70, 75, 80, 85, 90, 95, 100)) +
  scale_size(range = c(0,6),
             breaks = c(30, 40, 50, 60, 70, 80, 90, 100),
             labels = c(30, 40, 50, 60, 70, 80, 90, 100)) +
  labs(
    title = "High School SQRP Ratings are Heavily Determined by \n Graduation, Attendance, and College Enrollment", 
    subtitle = "CPS FY1819 High School SQRP Ratings vs. Graduation, Attendance, and College Enrollment", 
    caption = "CPS School Data Report: 2019 School Quality Rating Policy Results and Accountability Status
       *Outlier removed for High School with missing values and extreme values") +
  theme(
    plot.title = element_text(size = 18, hjust = 0.5, face = "bold", family = "Concert One"),
    plot.subtitle = element_text(size = 14, hjust = 0.5, family = "Bitter"),
    plot.caption = element_text(size = 12, hjust = 1, family = "Lobster"), 
    axis.title.x = element_text(size=12, face="bold"), 
    axis.title.y = element_text(size=12, face="bold"),
    strip.text.x = element_text(size = 10, face="bold"),
    panel.background = element_blank()) + 
  theme(legend.position = "bottom", legend.direction = "horizontal") 
   
sqrp_grad_attend

This graphs shows how CPS high school quality rating policy results (SQRP) are distributed. For each school, SQRP score lies with in any number between 0 - 4. As indicates from the graph, the lighter the color of the bubble, the high the SQRP score a school earns. On the meantime, this graph also shows the performance metrics for each school, including their high school graduation rate (observe through the x-axis), daily attendance rate (observe through the y-sxis), and college enrollment rate (observe through the size of the bubble, the bigger the bubble, the higher the college enrollment rate). Therefore, this grpah concludes that high school SQRP ratings are heavily determined by graduation, attendance, and college enrollment.

# read in file 
progress_2019 <- read_csv("progress_report/Chicago_Public_Schools_-_School_Progress_Reports_SY1819.csv", col_names = TRUE)
progress_2019 <- select(progress_2019, School_ID, Short_Name, starts_with('NWEA'))
progress_2019 <- select(progress_2019, School_ID, Short_Name, ends_with('Pct'))
progress_2019 <- select(progress_2019, School_ID, Short_Name, contains('Growth'))
progress_2019 <- progress_2019[complete.cases(progress_2019), ]
colnames(progress_2019) <- c("ID", "Name", "Reading_3", "Reading_4", "Reading_5", "Reading_6", "Reading_7", "Reading_8", 
                             "Math_3", "Math_4", "Math_5", "Math_6", "Math_7", "Math_8")

progress_2019 <- melt(progress_2019, id=c("ID","Name"))
progress_2019$subject <- ifelse(grepl("Math", progress_2019$variable), "Math", "Reading")
progress_2019$variable <- gsub('Math_3', '3', progress_2019$variable)
progress_2019$variable <- gsub('Reading_3', '3', progress_2019$variable)
progress_2019$variable <- gsub('Math_4', '4', progress_2019$variable)
progress_2019$variable <- gsub('Reading_4', '4', progress_2019$variable)
progress_2019$variable <- gsub('Math_5', '5', progress_2019$variable)
progress_2019$variable <- gsub('Reading_5', '5', progress_2019$variable)
progress_2019$variable <- gsub('Math_6', '6', progress_2019$variable)
progress_2019$variable <- gsub('Reading_6', '6', progress_2019$variable)
progress_2019$variable <- gsub('Math_7', '7', progress_2019$variable)
progress_2019$variable <- gsub('Reading_7', '7', progress_2019$variable)
progress_2019$variable <- gsub('Math_8', '8', progress_2019$variable)
progress_2019$variable <- gsub('Reading_8', '8', progress_2019$variable)
# draw graph
progress <- ggplot(progress_2019, aes(x= variable, y = value)) +
  geom_violin(trim = TRUE)+
  geom_jitter(position=position_jitter(0.1),  
              alpha = 0.5, 
              aes(color = subject == "Reading")) +
  geom_hline(yintercept = 50,  linetype="dashed", color = "red") +
  facet_wrap( ~ subject,nrow = 1) +
  stat_summary(fun.y=median, geom="line", aes(group=1))  + 
  stat_summary(fun.y=median, geom="point") +
  scale_color_manual(labels = c("Math", "Reading"), 
                     values = c("TRUE" = "#FBF4B1", "FALSE" = "#FFCBCB")) +
  xlab("Grades") + ylab("NWEA Growth (50 Stays Same)") + 
  scale_y_continuous(expand = c(0, 0)) +
  annotate("label",  x = 6, y = 70,  label = "Median") +
  annotate("text",  x = 5.5, y = 50,  label = "National Average") +
  labs(
    title = "CPS Students are Making Progress in both Math and Reading \n especially for Grade 7 and 8", 
    subtitle = "SY1819, NWEA Growth for Math and Reading for Students in Grade 3 - 8", 
    caption = "City of Chicago Data Portal: 2019 School Progress Reports",
    color = "Subject") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, hjust = 0.5, face = "bold", family = "Concert One"),
    plot.subtitle = element_text(size = 14, hjust = 0.5, family = "Bitter"),
    plot.caption = element_text(size = 12, hjust = 1, family = "Lobster"), 
    axis.title.x = element_text(size=12, face="bold"), 
    axis.title.y = element_text(size=12, face="bold"),
    panel.background = element_blank(),
    legend.position = "bottom",
    legend.spacing.x = unit(0.5, 'cm'),
    legend.text = element_text(size=10, face="bold"),
    legend.title = element_blank(), 
    strip.text.x = element_text(size = 15, face="bold", color = "#3c4f65"))

progress

This graphs shows how 402 CPS elementary school students are making progress in both of their NEWA math and reading attainment exams. Growth measures the change in between two points in time. This growth is compared to the average national growth for schools that started in the same place. A 50th percentile score means the school grew at the same rate as the national average. The black lines show the median of Growth among CPS students of different grades. We can observe that, median level of 7th grade and 8th grade in both math and reading are exceeding the national average. Despite the fact that Growth for different elementary schools varied a lot, and there are schools with very low Growth performance.

# Graph 6

# prepare data
filter_column <- function(df){
  df <- select(df, 
               contains('School_Survey'), 
               -ends_with('Pct'),
               -ends_with('Description'))
  return(df)
}

generate_count <- function(df, year){
  
  Involved_Families <- count(df, "School_Survey_Involved_Families")
  Involved_Families$type <- 'Involved Families'
  colnames(Involved_Families) <- c("degree", "count", "type")
  
  Supportive_Environment <- count(df, "School_Survey_Supportive_Environment")
  Supportive_Environment$type <- 'Supportive Environment'
  colnames(Supportive_Environment) <- c("degree", "count", "type")
  
  Ambitious_Instruction <- count(df, "School_Survey_Ambitious_Instruction")
  Ambitious_Instruction$type <- 'Ambitious Instruction'
  colnames(Ambitious_Instruction) <- c("degree", "count", "type")
  
  Effective_Leaders <- count(df, "School_Survey_Effective_Leaders")
  Effective_Leaders$type <- 'Effective Leaders'
  colnames(Effective_Leaders) <- c("degree", "count", "type")
  
  Collaborative_Teachers <- count(df, "School_Survey_Collaborative_Teachers")
  Collaborative_Teachers$type <- 'Collaborative Teachers'
  colnames(Collaborative_Teachers) <- c("degree", "count", "type")
  
  Safety <- count(df, "School_Survey_Safety")
  Safety$type <- 'Safety'
  colnames(Safety) <- c("degree", "count", "type")
  
  School_Community <- count(df, "School_Survey_School_Community")
  School_Community$type <- 'School Community'
  colnames(School_Community) <- c("degree", "count", "type")
  
  Parent_Teacher_Partnership <- count(df,"School_Survey_Parent_Teacher_Partnership")
  Parent_Teacher_Partnership$type <- 'Parent Teacher Partnership'
  colnames(Parent_Teacher_Partnership) <- c("degree", "count", "type")
  
  Quality_Of_Facilities <- count(df, "School_Survey_Quality_Of_Facilities")
  Quality_Of_Facilities$type <- 'Quality Of Facilities'
  colnames(Quality_Of_Facilities) <- c("degree", "count", "type")
  
  survey_one_year <- bind_rows(Involved_Families, Supportive_Environment, Ambitious_Instruction, Effective_Leaders, 
                              Collaborative_Teachers, Safety, School_Community, Parent_Teacher_Partnership, Quality_Of_Facilities)
  
  survey_one_year$year <- year
  
  return(survey_one_year)
}

progress_2019 <- read_csv("progress_report/Chicago_Public_Schools_-_School_Progress_Reports_SY1819.csv", col_names = TRUE)
progress_2019 <- filter_column(progress_2019)
progress_2019 <- generate_count(progress_2019, 2019)

progress_2018 <- read_csv("progress_report/Chicago_Public_Schools_-_School_Progress_Reports_SY1718.csv", col_names = TRUE)
progress_2018 <- filter_column(progress_2018)
progress_2018 <- generate_count(progress_2018, 2018)

progress_2017 <- read_csv("progress_report/Chicago_Public_Schools_-_School_Progress_Reports_SY1617.csv", col_names = TRUE)
progress_2017 <- filter_column(progress_2017)
progress_2017 <- generate_count(progress_2017, 2017)

progress_2016 <- read_csv("progress_report/Chicago_Public_Schools_-_School_Progress_Reports_SY1516.csv", col_names = TRUE)
progress_2016 <- filter_column(progress_2016)
progress_2016 <- generate_count(progress_2016, 2016)

survey <- bind_rows(progress_2019, progress_2018, progress_2017, progress_2016)
survey <- survey[c("type", "year", "degree", "count")]
colnames(survey) <- c("group", "year", "degree", "value")
survey <- survey[complete.cases(survey), ]

survey$value <- as.numeric(survey$value)

survey$degree <- revalue(survey$degree, c("Neutral"="NEUTRAL"))
survey$degree <- revalue(survey$degree, c("Strong"="STRONG"))
survey$degree <- revalue(survey$degree, c("Very strong"="VERY STRONG"))
survey$degree <- revalue(survey$degree, c("Very weak"="VERY WEAK"))
survey$degree <- revalue(survey$degree, c("Weak"="WEAK"))
survey$degree <- as.factor(survey$degree)

survey$group <- revalue(survey$group, c("Involved Families"="A"))
survey$group <- revalue(survey$group, c("Supportive Environment"="B"))
survey$group <- revalue(survey$group, c("Ambitious Instruction"="C"))
survey$group <- revalue(survey$group, c("Effective Leaders"="D"))
survey$group <- revalue(survey$group, c("Collaborative Teachers"="E"))
survey$group <- revalue(survey$group, c("Safety"="F"))
survey$group <- revalue(survey$group, c("School Community"="G"))
survey$group <- revalue(survey$group, c("Parent Teacher Partnership"="H"))
survey$group <- revalue(survey$group, c("Quality Of Facilities"="I"))
survey$group <- as.factor(survey$group)

survey_2019 <- survey[(survey$year == '2019'),]
survey_2019$id <- seq.int(nrow(survey_2019))
# draw graph 
# Set a number of 'empty bar' to add at the end of each group
empty_bar=2
to_add = data.frame(matrix(NA, empty_bar*nlevels(survey_2019$group), ncol(survey_2019)) )
colnames(to_add) = colnames(survey_2019)
to_add$group=rep(levels(survey_2019$group), each=empty_bar)
survey_2019=rbind(survey_2019, to_add)
survey_2019=survey_2019 %>% arrange(group)
survey_2019$id=seq(1, nrow(survey_2019))

# Get the name and the y position of each label
label_data=survey_2019
number_of_bar=nrow(label_data)
angle= 90 - 360 * (label_data$id-0.5) /number_of_bar    
label_data$hjust<-ifelse( angle < -90, 1, 0)
label_data$angle<-ifelse(angle < -90, angle+180, angle)

# prepare a data frame for base lines
base_data=survey_2019 %>% 
  group_by(group) %>% 
  summarize(start=min(id), end=max(id) - empty_bar) %>% 
  rowwise() %>% 
  mutate(title=mean(c(start, end)))

# prepare a data frame for grid (scales)
grid_data = base_data
grid_data$end = grid_data$end[ c( nrow(grid_data), 1:nrow(grid_data)-1)] + 1
grid_data$start = grid_data$start - 1

# Make the plot
survey_plot <- ggplot(survey_2019, aes(x=as.factor(id), y=value)) +     
  geom_bar(aes(x=as.factor(id), y=value, fill=degree), stat="identity", alpha=0.8, width = 1) +
  geom_segment(data=grid_data, aes(x = end, y = 100, xend = start, yend = 100), 
               colour = "#C8D9EB", alpha=0.8, size=0.3 , inherit.aes = FALSE ) +
  geom_segment(data=grid_data, aes(x = end, y = 200, xend = start, yend = 200), 
               colour = "#C8D9EB", alpha=0.8, size=0.3 , inherit.aes = FALSE ) +
  geom_segment(data=grid_data, aes(x = end, y = 300, xend = start, yend = 300), 
               colour = "#C8D9EB", alpha=0.8, size=0.3 , inherit.aes = FALSE ) +
  annotate("text", x = rep(max(survey_2019$id),4), y = c(100, 200, 300, 400), 
           label = c("100", "200", "300", "400") , color="grey", size=3 , angle=0, fontface="bold", hjust=1) +
  scale_fill_manual(values=c("#D3BDA2","#615B59","#DB9A96","#DBB2AF", "#E5CAC5", "#E7DFE0")) +
  ylim(-200,350) +
  coord_polar() + 
  labs(
    title = "Schools are not Promoting Safety and School Community", 
    subtitle = "Schools have Effective Leaders, Collaborative Teachers and Ambitious Instruction", 
    caption = "City of Chicago Data Portal: 2019 School Progress Reports") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, hjust = 0.5, face = "bold", family = "Concert One"),
    plot.subtitle = element_text(size = 14, hjust = 0.5, family = "Bitter"),
    plot.caption = element_text(size = 12, hjust = 1, family = "Lobster"), 
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    legend.position = "bottom") +
  geom_text(data=label_data, aes(x=id, y=value+10, label=value, hjust=hjust), 
            color="black", fontface="bold",alpha=0.6, size=2.5, angle= label_data$angle, inherit.aes = FALSE ) +
  geom_segment(data=base_data, aes(x = start, y = -5, xend = end, yend = -5), 
               colour = "black", alpha=0.8, size=0.6 , inherit.aes = FALSE ) +
  geom_text(data=base_data, aes(x = title, y = -200, label= "Survey Questions"),
            colour = "black", alpha=0.8, size=4, fontface="bold", inherit.aes = FALSE)
 
survey_plot

This graphs shows the result of the Correlation matrix for performance matrix, such as grade, shcool rating, attendance, survey participation, and socio ecnomic status. Each box represents the correlation, range frmo -1 t 1. We found out a Strong Positive Association, SQRP with Math & Reading Attainment; and also a Strong Negetive Association: % Free Lunch & Math & Reading Attainment.